House Hold Energy Data - Time Series
This data was collected for an apartment unit in San Jose for one plus year. The data is collected with smart meters and shared by the energy company. This is time-series data by nature and can be used for various time-series Machine Learning experiments.
The data contains eight attributes.
TYPE - This is an information column. The value is ‘Electric usage’
for all the observations.
DATE - Date of electric consumption.
There is no timestamp in this field.
START TIME - Start time of the
consumption.
END TIME - End time of the consumption
USAGE -
Consumption in kWh
UNITS - This column denotes measurement unit. It
is kWh for all the observations.
COST - Cost of consumption in $.
NOTES - Mostly an empty column.
library(stringr)
library(ggplot2)
library(ggthemes)
library(dplyr)
library(urca)
library(tseries)
library(fpp2)
library(forecast)
options(warn=-1)
data<-read.csv("/Users/shubhammishra/Desktop/VII Semester/ECO619- Time Series Analysis and Forecasting/Assignment/Data/archive/D202.csv")
data$COST<-as.numeric(gsub("$", "",as.character(data$COST), fixed = T))
data$year <- gsub("/", "",as.character(data$DATE), fixed = T)
data$year <- str_sub(data$year,-4)
data<-subset(data, year == '2017')
data$time<-as.character(paste(data$DATE, data$START.TIME))
data <- data[!names(data) %in% c("year", "TYPE", "DATE", "START.TIME", "END.TIME", "UNITS", "NOTES")]
colnames(data)<-c("use","cost","time")
data <- data %>% select("time",everything())
Nth.ddatate<-function(dataframe, n)dataframe[-(seq(n,to=nrow(dataframe),by=n)),]
i <- 4
dummy<-data
while(i >= 2){
dummy<-Nth.ddatate(dummy, i)
i <- i - 1
}
i <- 4
data<-data %>%
group_by(time = gl(ceiling(nrow(data)/i), i, nrow(data))) %>%
summarise_each(funs(sum))
data$time<-dummy$time
rm(list = setdiff(ls(), c("data")))
summary(data)
## time use cost
## Length:8760 Min. :0.0000 Min. :0.00000
## Class :character 1st Qu.:0.1200 1st Qu.:0.04000
## Mode :character Median :0.1600 Median :0.04000
## Mean :0.4835 Mean :0.09604
## 3rd Qu.:0.4000 3rd Qu.:0.08000
## Max. :9.4400 Max. :2.52000
str(data)
## tibble [8,760 × 3] (S3: tbl_df/tbl/data.frame)
## $ time: chr [1:8760] "1/1/2017 0:00" "1/1/2017 1:00" "1/1/2017 2:00" "1/1/2017 3:00" ...
## $ use : num [1:8760] 1.48 1.4 1.4 1.4 1.32 1.36 1.36 1.32 1.28 0.36 ...
## $ cost: num [1:8760] 0.28 0.24 0.24 0.24 0.24 0.24 0.24 0.24 0.24 0.08 ...
df<-data
ggplot()+
geom_line(data=df, aes(x = time, y = use, group=1)) + ylab('Usage')+xlab('Time index') + theme_clean()
count_ma<-ts(df$use,frequency = 8760/12)
decomp<-stl(count_ma,"periodic")
deseasonal_cnt<-seasadj(decomp) #Returns seasonally adjusted data constructed by removing the seasonal component
plot(decomp) #seasonality visible
plot(df$use, type='l') #visual check for stationary variance
adf.test(count_ma, alternative = 'stationary') #adf test
##
## Augmented Dickey-Fuller Test
##
## data: count_ma
## Dickey-Fuller = -14.801, Lag order = 20, p-value = 0.01
## alternative hypothesis: stationary
Acf(count_ma, main='', lag.max = 30) #corr btw the time-series and its lag
Pacf(count_ma, main='', lag.max = 30) #corr btw the time-series and its lag using partial autocorrelation function
ss <- 1 #difference counter
count_d1 <- diff(deseasonal_cnt, differences = ss)
plot(count_d1) #visual check
adf.test(count_d1, alternative = "stationary") #adf test
##
## Augmented Dickey-Fuller Test
##
## data: count_d1
## Dickey-Fuller = -35.857, Lag order = 20, p-value = 0.01
## alternative hypothesis: stationary
Acf(count_d1, main='', lag.max = 30) #corr btw the time-series and its lag
Pacf(count_d1, main='', lag.max = 30) #corr btw the time-series and its lag using partial autocorrelation function
fit1 <- auto.arima(deseasonal_cnt, seasonal = FALSE) #auto mode gives the most like version for arima
fit1 #gives ARIMA(0,1,1)
## Series: deseasonal_cnt
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## -0.9773
## s.e. 0.0033
##
## sigma^2 = 0.7356: log likelihood = -11084.93
## AIC=22173.86 AICc=22173.86 BIC=22188.02
tsdisplay(residuals(fit1), lag.max = 30, main = 'Model Residuals from ARIMA(0,1,1)') #checking the residuals
fit2 <- arima(deseasonal_cnt, order=c(2,0,24)) # from previous plot - experimental values
fit2 #is the AIC lower?
##
## Call:
## arima(x = deseasonal_cnt, order = c(2, 0, 24))
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 ma4 ma5 ma6
## 0.5604 0.4110 -0.1496 -0.5702 -0.1967 -0.0382 0.0122 -0.0151
## s.e. 0.1292 0.1277 0.1288 0.0757 0.0452 0.0148 0.0128 0.0129
## ma7 ma8 ma9 ma10 ma11 ma12 ma13 ma14
## -0.0565 -0.0052 0.0326 -0.0289 0.0671 0.1173 -0.0023 -0.0622
## s.e. 0.0130 0.0141 0.0129 0.0133 0.0134 0.0168 0.0163 0.0141
## ma15 ma16 ma17 ma18 ma19 ma20 ma21 ma22
## -0.0224 -0.0060 0.0201 -0.0317 -0.0248 -0.0167 0.0083 0.0324
## s.e. 0.0142 0.0129 0.0129 0.0124 0.0150 0.0130 0.0130 0.0131
## ma23 ma24 intercept
## 0.0206 0.0420 0.4930
## s.e. 0.0132 0.0118 0.0367
##
## sigma^2 estimated as 0.6106: log likelihood = -10269.42, aic = 20594.85
tsdisplay(residuals(fit2), lag.max = 30, main = 'Model Residuals from ARIMA(2,0,24)')
autoplot(forecast(fit1))
hold<-window(ts(deseasonal_cnt),start(8000))
mod_fit<- arima(ts(deseasonal_cnt[-c(8000:8760)]), order = c(2,0,24))
ff<-forecast(mod_fit, h=759)
plot(ff, main="") #plotting the forecast
lines(ts(deseasonal_cnt)) #actual value over the forecast values
sfit1 <- auto.arima(deseasonal_cnt, seasonal = TRUE)
tsdisplay(residuals(sfit1), lag.max = 30, main = 'Residuals from auto ARIMA with seasonality')
sff<-forecast(sfit1, h=760)
plot(sff, main="Auto Arima with seasonality")
lines(ts(deseasonal_cnt))
sfit1 <- auto.arima(deseasonal_cnt, seasonal = TRUE)
tsdisplay(residuals(sfit1), lag.max = 30, main = 'Model Residuals from auto ARIMA with seasonality')
fit1 <- auto.arima(deseasonal_cnt, seasonal = FALSE)
tsdisplay(residuals(fit1), lag.max = 30, main = 'Model Residuals from ARIMA(0,1,1)')
fit2 <- arima(deseasonal_cnt, order=c(2,0,24))
tsdisplay(residuals(fit2), lag.max = 30, main = 'Model Residuals from ARIMA(2,0,24)')
fit3 <- arima(deseasonal_cnt, order=c(1,1,1))
tsdisplay(residuals(fit2), lag.max = 30, main = 'Model Residuals from ARIMA(1,1,1)')
fit10 <- ets(df$use) #ETS model of the original data
plot(fit10)
tsdisplay(residuals(fit10), lag.max = 30, main = 'ETS model')
par(mfrow=c(2,3))
sff1<-forecast(sfit1, h=760)
plot(sff1)
lines(ts(deseasonal_cnt))
ff1<-forecast(fit1, h=760)
plot(ff1)
lines(ts(deseasonal_cnt))
ff2<-forecast(fit2, h=760)
plot(ff2)
lines(ts(deseasonal_cnt))
ff3<-forecast(fit3, h=760)
plot(ff3)
lines(ts(deseasonal_cnt))
ff10<-forecast(fit10, h=760)
plot(ff10)
lines(ts(deseasonal_cnt))
accuracy(sfit1) #auto arima with seasonality
## ME RMSE MAE MPE MAPE MASE
## Training set -0.002743618 0.7902171 0.4319475 15.42881 302.9777 0.8019148
## ACF1
## Training set -0.001635
accuracy(fit1) #auto arima without seasonality
## ME RMSE MAE MPE MAPE MASE
## Training set -0.002649596 0.8575934 0.4841065 4.942987 361.1992 0.8987486
## ACF1
## Training set 0.3609823
accuracy(fit2) #arima(2,0,24)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.001707066 0.7813777 0.4239704 26.4341 310.2939 0.8740648
## ACF1
## Training set -0.001212339
accuracy(fit3) #arima(1,1,1)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.00349833 0.7973942 0.4284029 15.34037 311.1938 0.8832028
## ACF1
## Training set 0.03951601
accuracy(fit10) #ets model
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.002375444 0.8952574 0.474522 -Inf Inf 1.193445 0.3560017